Part 2 of the analysis. Continued directly from KP_model_data_filtering_part1.ipynb.
print(filtered_assn.unique())
print(len(filtered_assn.unique()))
['B cell' 'Mono/Mac/DC' 'T/NK cell' 'Neutrophil' 'Low Quality' 'Basophil' 'Fibroblast' 'Lymphatic Endothelial' 'Blood Endothelial' 'AT1/AT2' 'Platelet' 'Ciliated'] 12
# save the lineage specific embeddings
for lin in filtered_assn.unique():
lin_path = lin.replace(' ','_').replace('/','_')
lineage_pcaproj_dict[lin].to_csv('/Users/rs3380/Dropbox/lung_tumor/new_folders/output/exp2/'
+ lin_path + '_pca.csv')
# load tSNE results:
# Used 20 PCA, perplexity = 50
lineage_tsneproj_dict = {}
for lin in filtered_assn.unique():
lin_path = lin.replace(' ','_').replace('/','_')
lineage_tsneproj_dict[lin] = pd.read_csv('/Users/rs3380/Dropbox/lung_tumor/results/' +
'coarse_assn_exp2_050419/filtered_lineage_embedding/' +
lin_path + '_tsne.csv', index_col = 0)
# library size for each sample individually
fig = plt.figure(figsize = (8*4, 6*3))
for i,fval in enumerate(filtered_assn.unique()):
ax = fig.add_subplot(3, 4, i+1)
ax.scatter(lineage_tsneproj_dict[fval].iloc[:,0], lineage_tsneproj_dict[fval].iloc[:,1],
s=10, edgecolors='none',
color=cols_use[i])
ax.set_title(fval,fontsize=20)
#ax = plt.gca()
ax.axis('equal')
ax.set_axis_off()
# plt.colorbar().set_label(fval)
#pu.make_legend(ax,condition_cdict,'upper left',(1,1))
#plt.show()
FIBSELECT = filtered_assn == 'Fibroblast'
fib_normlog = filtered_normlog.loc[FIBSELECT,:]
fib_info = filtered_info.loc[FIBSELECT,:]
fib_count = filtered_count.loc[FIBSELECT,:]
fib_normcount = filtered_normcount.loc[FIBSELECT,:]
fib_confounders = confounders.loc[fib_info.index, :]
plt.hist(np.log10(fib_info['libsize']), 100);
plt.axvline(3.4, c = 'r')
plt.xlabel('log10(library size)')
plt.ylabel('Frequency')
Text(0, 0.5, 'Frequency')
FIBCLEANSELECT = (np.log10(fib_info['libsize']) > 3.4) & (fib_confounders['scrublet_predict'] == False)
# clean fibroblast, genes with expression in >10 cells
GSELECT = (counts_df.loc[FIBCLEANSELECT.index[FIBCLEANSELECT]] > 0).sum(axis=0) > 10
fib_clean_count = counts_df.loc[FIBCLEANSELECT.index[FIBCLEANSELECT],GSELECT]
fib_clean_normlog = normlog_df.loc[FIBCLEANSELECT.index[FIBCLEANSELECT],GSELECT]
fib_clean_info = fib_info.loc[FIBCLEANSELECT.index[FIBCLEANSELECT]]
#fib_clean_normlog = fib_normlog.loc[FIBCLEANSELECT,:]
#fib_clean_info = fib_info.loc[FIBCLEANSELECT,:]
#fib_clean_count = fib_count.loc[FIBCLEANSELECT,:]
#fib_clean_normcount = fib_normcount.loc[FIBCLEANSELECT,:]
print(fib_clean_count.shape)
print(fib_clean_normlog.shape)
(3855, 13588) (3855, 13588)
fib_normlog.shape
(4496, 16435)
# check with the filtered fibroblasts file:
fib_clean_info_saved = pd.read_csv('/Users/rs3380/Dropbox/lung_tumor/lineage_analysis_061019/fibroblast_analysis/fibroblast_data/fibroblast_clean_info_061019.csv', index_col = 0)
print(fib_clean_info.shape)
print(fib_clean_info_saved.shape)
(3855, 8) (3855, 8)
print(set(fib_clean_info.index).difference(set(fib_clean_info_saved.index)))
print(set(fib_clean_info_saved.index).difference(set(fib_clean_info.index)))
set() set()
print(np.sum(np.abs(fib_clean_info_saved['ncells'] - fib_clean_info.loc[fib_clean_info_saved.index, 'ncells'])))
0
pca_clean = PCA(n_components=1000, svd_solver='randomized')
clean_pcaproj = pd.DataFrame(pca_clean.fit_transform(fib_clean_normlog),
index=fib_clean_normlog.index)
tsne = TSNE(n_jobs=6, random_state=1234, perplexity=100,verbose=True)
clean_tsneproj = pd.DataFrame(tsne.fit_transform(clean_pcaproj.iloc[:,0:50]),
index=clean_pcaproj.index, columns=['x','y'])
plt.scatter(clean_tsneproj['x'], clean_tsneproj['y'], c = np.log10(fib_clean_info['libsize'].values), s = 1,
cmap = 'Spectral_r')
plt.axis('off');
fib_cluster = pd.read_csv('/Users/rs3380/Dropbox/lung_tumor/lineage_analysis_061019/fibroblast_analysis/fibroblast_data/fibroblast_cluster_50PCs_k30.csv', header=None, index_col=0, squeeze=True,
names=['cluster'])#
communities = {}
for k in [20, 25, 30, 35, 40, 45]:
print(k)
communities[str(k)], _, _ = phenograph.cluster(clean_pcaproj.iloc[:,0:50],k=k,
clustering_algo = 'leiden',
resolution_parameter = 1)
#filtered_cluster = pd.Series(communities,index=filtered_pcaproj.index, name='cluster')
20 Finding 20 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.3381969928741455 seconds Jaccard graph constructed in 2.6930811405181885 seconds Running Leiden optimization Leiden completed in 0.23665690422058105 seconds Sorting communities by size, please wait ... PhenoGraph completed in 5.136100769042969 seconds 25 Finding 25 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.473635196685791 seconds Jaccard graph constructed in 4.3960418701171875 seconds Running Leiden optimization Leiden completed in 0.3155040740966797 seconds Sorting communities by size, please wait ... PhenoGraph completed in 7.425246953964233 seconds 30 Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.4610300064086914 seconds Jaccard graph constructed in 3.7542810440063477 seconds Running Leiden optimization Leiden completed in 0.6445629596710205 seconds Sorting communities by size, please wait ... PhenoGraph completed in 6.602176904678345 seconds 35 Finding 35 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.462785005569458 seconds Jaccard graph constructed in 4.099690914154053 seconds Running Leiden optimization Leiden completed in 0.4050581455230713 seconds Sorting communities by size, please wait ... PhenoGraph completed in 7.183154821395874 seconds 40 Finding 40 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.49021005630493164 seconds Jaccard graph constructed in 4.554915904998779 seconds Running Leiden optimization Leiden completed in 0.4875969886779785 seconds Sorting communities by size, please wait ... PhenoGraph completed in 7.3800859451293945 seconds 45 Finding 45 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.49616003036499023 seconds Jaccard graph constructed in 4.436186075210571 seconds Running Leiden optimization Leiden completed in 0.4367029666900635 seconds Sorting communities by size, please wait ... PhenoGraph completed in 7.7283830642700195 seconds
res_mat = np.zeros(shape = (6, 6))
for j, item in enumerate(communities.keys()):
for k, item2 in enumerate(communities.keys()):
res_mat[j, k] = adjusted_rand_score(communities[item], communities[item2])
plt.figure(figsize= (8, 6))
plt.imshow(res_mat, vmin = 0, vmax = 1, cmap = 'magma_r')
plt.colorbar()
plt.xticks(range(6), communities.keys());
plt.yticks(range(6), communities.keys());
#plt.grid()#which='minor', color='w', linestyle='-', linewidth=5)
plt.xlabel('k')
plt.ylabel('k')
plt.title('Adjusted Rand Index')
Text(0.5, 1.0, 'Adjusted Rand Index')
mt_genes = ['MT-' in j for j in fib_clean_count.columns]
mt_score = np.sum(fib_clean_count.iloc[:, mt_genes], axis = 1)/np.sum(fib_clean_count, axis = 1)
fig = plt.figure(figsize = (8*2, 6*1))
ax = fig.add_subplot(1, 2, 1)
im1 = ax.scatter(clean_tsneproj['x'], clean_tsneproj['y'], c = np.log10(fib_clean_info['libsize'].values), s = 1,
cmap = 'Spectral_r')
fig.colorbar(im1)
ax.axis('off');
ax.set_title('Colored by library size');
ax = fig.add_subplot(1, 2, 2)
im1 = ax.scatter(clean_tsneproj['x'], clean_tsneproj['y'], c = mt_score*100, s = 1, cmap = 'Spectral_r')
fig.colorbar(im1)
ax.axis('off');
ax.set_title('Colored by MT- content');
FSELECT = fib_cluster.isin([11])
filtered_fib_count = fib_clean_count.loc[~FSELECT,:]
filtered_fib_normlog = fib_clean_normlog.loc[~FSELECT,:]
filtered_fib_info = fib_clean_info.loc[~FSELECT,:]
filtered_fib_confounders = fib_confounders.loc[FSELECT.index[~FSELECT],:]
filtered_fib_count.shape
(3791, 13588)
# Check with stored info
filtered_fib_info_saved = pd.read_csv('/Users/rs3380/Dropbox/lung_tumor/lineage_analysis_061019/fibroblast_analysis_v2/fib_data/fib_clean_info_050420.csv', index_col = 0)
print(filtered_fib_info.shape)
print(filtered_fib_info_saved.shape)
(3791, 8) (3791, 8)
print(set(filtered_fib_info.index).difference(set(filtered_fib_info_saved.index)))
print(set(filtered_fib_info_saved.index).difference(set(filtered_fib_info.index)))
set() set()
print(np.sum(np.abs(filtered_fib_info_saved['libsize'] - filtered_fib_info.loc[filtered_fib_info_saved.index, 'libsize'])))
0.0
pca_fib = PCA(n_components=1000, svd_solver='randomized')
filtered_fib_pcaproj = pd.DataFrame(pca_fib.fit_transform(filtered_fib_normlog),
index=filtered_fib_normlog.index)
tsne = TSNE(n_jobs=6, random_state=1234, perplexity=100, n_iter=5000, verbose=True)
filtered_fib_tsneproj = pd.DataFrame(tsne.fit_transform(filtered_fib_pcaproj.iloc[:,0:50]),
index=filtered_fib_pcaproj.index, columns=['x','y'])
filtered_fib_cluster = phenograph.cluster(filtered_fib_pcaproj.iloc[:, 0:50], k = 30)
filtered_fib_cluster_saved = pd.read_csv('/Users/rs3380/Dropbox/lung_tumor/lineage_analysis_061019/fibroblast_analysis_v2/fib_data/fib_filtered_phenograph_50PCs_k30.csv',
index_col = 0, header = None)
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.2441258430480957 seconds Jaccard graph constructed in 2.2944560050964355 seconds Wrote graph to binary file in 0.07903623580932617 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.822081 After 6 runs, maximum modularity is Q = 0.823368 After 18 runs, maximum modularity is Q = 0.825055 Louvain completed 38 runs in 5.54936408996582 seconds Sorting communities by size, please wait ... PhenoGraph completed in 9.582306861877441 seconds
communities = {}
for k in [20, 25, 30, 35, 40, 45]:
print(k)
communities[str(k)], _, _ = phenograph.cluster(filtered_fib_pcaproj.iloc[:,0:50],k=k,
clustering_algo = 'leiden',
resolution_parameter = 1)
#filtered_cluster = pd.Series(communities,index=filtered_pcaproj.index, name='cluster')
20 Finding 20 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.39167284965515137 seconds Jaccard graph constructed in 5.596827983856201 seconds Running Leiden optimization Leiden completed in 0.5714268684387207 seconds Sorting communities by size, please wait ... PhenoGraph completed in 12.64493703842163 seconds 25 Finding 25 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6508488655090332 seconds Jaccard graph constructed in 5.157021999359131 seconds Running Leiden optimization Leiden completed in 0.3395040035247803 seconds Sorting communities by size, please wait ... PhenoGraph completed in 8.707941770553589 seconds 30 Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6161301136016846 seconds Jaccard graph constructed in 9.769864082336426 seconds Running Leiden optimization Leiden completed in 0.8562829494476318 seconds Sorting communities by size, please wait ... PhenoGraph completed in 15.560796022415161 seconds 35 Finding 35 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6402108669281006 seconds Jaccard graph constructed in 9.590893030166626 seconds Running Leiden optimization Leiden completed in 0.7870767116546631 seconds Sorting communities by size, please wait ... PhenoGraph completed in 16.208773136138916 seconds 40 Finding 40 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.6953847408294678 seconds Jaccard graph constructed in 6.331328868865967 seconds Running Leiden optimization Leiden completed in 0.6546969413757324 seconds Sorting communities by size, please wait ... PhenoGraph completed in 10.580172061920166 seconds 45 Finding 45 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.5574121475219727 seconds Jaccard graph constructed in 5.593844175338745 seconds Running Leiden optimization Leiden completed in 1.1461048126220703 seconds Sorting communities by size, please wait ... PhenoGraph completed in 10.36968994140625 seconds
res_mat = np.zeros(shape = (6, 6))
for j, item in enumerate(communities.keys()):
for k, item2 in enumerate(communities.keys()):
res_mat[j, k] = adjusted_rand_score(communities[item], communities[item2])
plt.figure(figsize= (8, 6))
plt.imshow(res_mat, vmin = 0, vmax = 1, cmap = 'magma_r')
plt.colorbar()
plt.xticks(range(6), communities.keys());
plt.yticks(range(6), communities.keys());
#plt.grid()#which='minor', color='w', linestyle='-', linewidth=5)
plt.xlabel('k')
plt.ylabel('k')
plt.title('Adjusted Rand Index')
Text(0.5, 1.0, 'Adjusted Rand Index')
# edited down
# Travagliania: alveolar vs adventitial,
# most of these are not consistent, perhaps most matrix are adventitial by PI16 and SERPINF1
FIBMARKERS = ['PDPN', 'COL1A1', 'COL1A2', 'BSG', 'PDGFRA', 'ELN', 'BGN', # general
'FN1','POSTN','FAP','S100A4','COL13A1','COL14A1',
'FGFR4', 'GPC3', 'TBX2', 'ITGA8', 'VEGFD', 'SLC7A10', # alveolar
'PI16', 'SERPINF1', 'SCARA5', 'MFAP5', 'IGFBP4', 'C3', 'ENTPD2', # vascular adventitial, airway
'LPL','PLIN2','LIPA','FABP1','FABP4','FABP5','APOE','FST', # lipofiboblast
'PDGFRB', 'CSPG4', 'MCAM', 'TRPC6', 'HIGD1B', 'VTN', 'COX4I2', # pericyte
'WT1', 'UPK3B', 'MSLN', 'GATA6', # mesothelial
'ACTA2', 'WIF1', 'FGF18', 'ASPN', 'MYH11', 'CNN1', 'TAGLN' # myofibroblast
]
len(FIBMARKERS)
52
fig = plt.figure(figsize = (8*4, 6*13))
for j, item in enumerate(FIBMARKERS):
ax = fig.add_subplot(13, 4, j + 1)
c = filtered_fib_normlog[item]
im1 = ax.scatter(filtered_fib_tsneproj['x'], filtered_fib_tsneproj['y'], s = 3,
c = c, vmin = np.percentile(c, 1), vmax = np.percentile(c, 99))
ax.axis('off')
ax.set_title(item)
fig.colorbar(im1)
FIBMARKERS_dict = {'General': ['PDPN', 'COL1A1', 'COL1A2', 'BSG', 'PDGFRA', 'ELN', 'BGN'],
'Alveolar': ['FN1','POSTN','FAP','S100A4','COL13A1','COL14A1', 'FGFR4', 'GPC3', 'TBX2', 'ITGA8', 'VEGFD', 'SLC7A10'],
'Vascular Adventitial': ['PI16', 'SERPINF1', 'SCARA5', 'MFAP5', 'IGFBP4', 'C3', 'ENTPD2'], # artery
'Lipo Fibroblast': ['LPL','PLIN2','LIPA','FABP1','FABP4','FABP5','APOE','FST'], # large vessel, vein
'Pericyte': ['PDGFRB', 'CSPG4', 'MCAM', 'TRPC6', 'HIGD1B', 'VTN', 'COX4I2'], # capillary 2
'Mesothelial': ['WT1', 'UPK3B', 'MSLN', 'GATA6'], # capillary 1
'Myofibroblast': ['ACTA2', 'WIF1', 'FGF18', 'ASPN', 'MYH11', 'CNN1', 'TAGLN'] # lymphatic
}
col_markers = []
ct = 0
for item, values in FIBMARKERS_dict.items():
col_markers = col_markers + [cols_use[ct]]*len(values)
ct = ct + 1
fib_sub_data = filtered_fib_normlog[FIBMARKERS].copy()
fib_sub_data['Cluster'] = list(filtered_fib_cluster_saved.values.flatten())
fib_sub_data_avg = fib_sub_data.groupby('Cluster').mean()
import matplotlib as mpl
mpl.style.use('default')
g = sns.clustermap(fib_sub_data_avg, standard_scale = 1, col_colors = col_markers, col_cluster = False,
figsize = (18, 6), linewidth = 0.2, cbar_pos=(1, .3, .01, .4))
# Draw the legend bar for the classes
for ct, label in enumerate([j for j in FIBMARKERS_dict.keys()]):
g.ax_col_dendrogram.bar(0, 0, color=cols_use[ct],
label=label, linewidth=0)
g.ax_col_dendrogram.legend(ncol=1, bbox_to_anchor = (0, 0.8))
<matplotlib.legend.Legend at 0x7fddb6f0e100>
res = utils.run_diffusion_maps(filtered_fib_pcaproj.iloc[:,0:50], knn = 30)
Determing nearest neighbor graph using euclidean distance metric
def impute_expression(T,expression,TSTEPS=4):
T_step = T
# impute values first, then keep stepping to speed it up
T_imputed = T_step.dot(expression.values)
for i in range(1,TSTEPS):
print('step {}'.format(i+1))
T_imputed = T_step.dot(T_imputed)
expression_imputed = pd.DataFrame(T_imputed,index=expression.index, columns=expression.columns)
return expression_imputed
filtered_fib_imputed = impute_expression(res['T'], filtered_fib_normlog, TSTEPS=4)
step 2 step 3 step 4
fig = plt.figure(figsize = (8*4, 6*13))
for j, item in enumerate(FIBMARKERS):
ax = fig.add_subplot(13, 4, j + 1)
c = filtered_fib_imputed[item]
im1 = ax.scatter(filtered_fib_tsneproj['x'], filtered_fib_tsneproj['y'],
c = c, cmap = 'Spectral_r')
ax.axis('off')
ax.set_title(item)
fig.colorbar(im1)
fib_sub_data = filtered_fib_imputed[FIBMARKERS].copy()
fib_sub_data['Cluster'] = list(filtered_fib_cluster_saved.values.flatten())
fib_sub_data_avg = fib_sub_data.groupby('Cluster').mean()
import matplotlib as mpl
mpl.style.use('default')
g = sns.clustermap(fib_sub_data_avg, standard_scale = 1, col_colors = col_markers, col_cluster = False,
figsize = (18, 6), linewidth = 0.2, cbar_pos=(1, .3, .01, .4))
# Draw the legend bar for the classes
for ct, label in enumerate([j for j in FIBMARKERS_dict.keys()]):
g.ax_col_dendrogram.bar(0, 0, color=cols_use[ct],
label=label, linewidth=0)
g.ax_col_dendrogram.legend(ncol=2, bbox_to_anchor = (0, 0.8))
<matplotlib.legend.Legend at 0x7fdb4ccd5040>
fib_coarse_assn = {'COL14A1 Matrix':[0,3,6,9,11],
'COL13A1 Matrix':[4],
'Pericyte':[1,7],
'Myofibroblast':[2,5,12],
'MSC':[8],
'Mesothelial':[10]}
# individual cell assignments
fib_coarse_assn_clustmap = {}
for a,clusts in fib_coarse_assn.items():
for c in clusts:
fib_coarse_assn_clustmap[c] = a
fib_coarse = pd.Series(fib_cluster.map(fib_coarse_assn_clustmap),name='assignment')
fig = plt.figure(figsize = (8*2, 6*1))
ax = fig.add_subplot(1, 2, 1)
old_cluster = filtered_fib_cluster_saved.values.flatten()
for j in np.unique(old_cluster):
ax.scatter(filtered_fib_tsneproj['x'][old_cluster == j],
filtered_fib_tsneproj['y'][old_cluster == j],
s = 1, c = cols[j], label = str(j))
ax.legend(markerscale = 5, fontsize = 10, ncol = 2)
ax.axis('off');
ax.set_title('Old clusters')
ax = fig.add_subplot(1, 2, 2)
new_cluster = fib_coarse
for j, item in enumerate(np.unique(new_cluster)):
ax.scatter(filtered_fib_tsneproj['x'][new_cluster == item],
filtered_fib_tsneproj['y'][new_cluster == item],
s = 1, c = cols[j], label = item)
ax.legend(markerscale = 5, fontsize = 10, ncol = 1)
ax.axis('off');
ax.set_title('New clusters')
Text(0.5, 1.0, 'New clusters')
# lymphatic or blood endothlial, genes with expression in >10 cells
EINDICES = filtered_assn.loc[filtered_assn.isin(['Lymphatic Endothelial', 'Blood Endothelial'])].index
GSELECT = (counts_df.loc[EINDICES,:] > 0).sum(axis=0) > 10
endo_count = counts_df.loc[EINDICES,GSELECT]
endo_normlog = normlog_df.loc[EINDICES,GSELECT]
endo_count.shape
(3333, 12131)
endo_info = filtered_info.loc[EINDICES,:]
endo_confounders = confounders.loc[EINDICES,:]
pca_endo = PCA(n_components=1000, svd_solver='randomized')
endo_pcaproj = pd.DataFrame(pca_endo.fit_transform(endo_normlog),
index=endo_normlog.index)
tsne = TSNE(n_jobs=6, random_state=1234, perplexity=100,verbose=True)
endo_tsneproj = pd.DataFrame(tsne.fit_transform(endo_pcaproj.iloc[:,0:50]),
index=endo_pcaproj.index, columns=['x','y'])
plt.scatter(endo_tsneproj['x'], endo_tsneproj['y'], s = 1, c= np.log10(endo_info['libsize']))
plt.colorbar()
<matplotlib.colorbar.Colorbar at 0x7fde871c57f0>
endo_cluster_saved = pd.read_csv('/Users/rs3380/Dropbox/lung_tumor/lineage_analysis_061019/endothelial_analysis/endo_data/endo_full_leiden_50PCs_k30_res1.csv',
index_col=0, header=None, squeeze=True, names=['cluster'])
np.log10(endo_confounders['libsize'].values)
array([3.63265971, 2.70500796, 3.62221402, ..., 3.45666963, 3.57966929,
3.42618583])
endo_confounders_sub = endo_confounders[['mito_fraction', 'ribo_fraction']].copy()
endo_confounders_sub['log_libsize'] = list(np.log10(endo_confounders['libsize'].values))
endo_confounders_sub['Cluster'] = endo_cluster_saved.loc[endo_confounders_sub.index]
endo_confounders_sub_avg = endo_confounders_sub.groupby('Cluster').mean()
sns.clustermap(endo_confounders_sub_avg, standard_scale = 1)
<seaborn.matrix.ClusterGrid at 0x7fde86326910>
np.log10(endo_confounders['libsize'])
0
120703409145716001 3.632660
120703409348020001 2.705008
120703409646299001 3.622214
120703436052205001 3.008174
120726943315830001 3.568905
...
241057700101405011 3.556303
241106421208805011 3.377670
241114562291571011 3.456670
241114589027246011 3.579669
241114589063987011 3.426186
Name: libsize, Length: 3333, dtype: float64
fig = plt.figure(figsize = (8*2, 6*2))
ax = fig.add_subplot(2, 2, 1)
old_cluster = endo_cluster_saved.values
for j in np.unique(old_cluster):
ax.scatter(endo_tsneproj['x'][old_cluster == j], endo_tsneproj['y'][old_cluster == j],
s = 1, c = cols[j], label = str(j))
ax.legend(markerscale = 5, fontsize = 10, ncol = 2)
ax.axis('off');
ax.set_title('Clusters')
ax = fig.add_subplot(2, 2, 2)
im0 = ax.scatter(endo_tsneproj['x'], endo_tsneproj['y'],
s = 1, c = np.log10(endo_confounders['libsize']))
ax.axis('off');
ax.set_title('log10(library size)')
fig.colorbar(im0)
ax = fig.add_subplot(2, 2, 3)
im1 = ax.scatter(endo_tsneproj['x'], endo_tsneproj['y'],
s = 1, c = endo_confounders['mito_fraction'])
ax.axis('off');
ax.set_title('MT- frac')
fig.colorbar(im1)
ax = fig.add_subplot(2, 2, 4)
im2 = ax.scatter(endo_tsneproj['x'], endo_tsneproj['y'],
s = 1, c = endo_confounders['ribo_fraction'])
ax.axis('off');
ax.set_title('Ribo frac')
fig.colorbar(im2)
<matplotlib.colorbar.Colorbar at 0x7fde8741a820>
FSELECT = (endo_info['libsize'] < 1000) | endo_cluster_saved.isin([6,7])
filtered_endo_count = endo_count.loc[~FSELECT,:]
filtered_endo_normlog = endo_normlog.loc[~FSELECT,:]
filtered_endo_info = endo_info.loc[~FSELECT,:]
filtered_endo_confounders = endo_confounders.loc[~FSELECT,:]
pca_endo = PCA(n_components=1000, svd_solver='randomized')
filtered_endo_pcaproj = pd.DataFrame(pca_endo.fit_transform(filtered_endo_normlog),
index=filtered_endo_normlog.index)
tsne = TSNE(n_jobs=6, random_state=1234, perplexity=100,verbose=True)
filtered_endo_tsneproj = pd.DataFrame(tsne.fit_transform(filtered_endo_pcaproj.iloc[:,0:50]),
index=filtered_endo_pcaproj.index, columns=['x','y'])
filtered_endo_tsneproj_saved = pd.read_csv('/Users/rs3380/Dropbox/lung_tumor/lineage_analysis_061019/endothelial_analysis/endo_data/endo_clean_tsneproj_50PCs_p100.csv', index_col = 0)
fig = plt.figure(figsize = (8*2, 6*1))
ax = fig.add_subplot(1, 2, 1)
im0 = ax.scatter(filtered_endo_tsneproj_saved['x'], filtered_endo_tsneproj_saved['y'],
s = 1, c = np.log10(filtered_endo_confounders['libsize']))
ax.axis('off');
ax.set_title('log10(library size)')
fig.colorbar(im0)
ax = fig.add_subplot(1, 2, 2)
im0 = ax.scatter(filtered_endo_tsneproj_saved['x'], filtered_endo_tsneproj_saved['y'],
s = 1, c = np.log10(filtered_endo_confounders['libsize']))
ax.axis('off');
ax.set_title('log10(library size)')
fig.colorbar(im0)
<matplotlib.colorbar.Colorbar at 0x7fe2453057f0>
filtered_endo_cluster_saved = pd.read_csv('/Users/rs3380/Dropbox/lung_tumor/lineage_analysis_061019/endothelial_analysis/endo_data/endo_filtered_leiden_50PCs_k30_res1.csv',
index_col=0, squeeze=True, names=['cluster'])
filtered_endo_cluster, _, _ = phenograph.cluster(filtered_endo_pcaproj.iloc[:,0:50], k = 30,
clustering_algo = 'leiden', resolution_parameter = 1.0)
Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11496090888977051 seconds Jaccard graph constructed in 2.217146873474121 seconds Running Leiden optimization Leiden completed in 0.17226195335388184 seconds Sorting communities by size, please wait ... PhenoGraph completed in 3.924635887145996 seconds
communities = {}
for k in [20, 25, 30, 35, 40, 45]:
print(k)
communities[str(k)], _, _ = phenograph.cluster(filtered_endo_pcaproj.iloc[:,0:50],k=k,
clustering_algo = 'leiden',
resolution_parameter = 1)
20 Finding 20 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11580204963684082 seconds Jaccard graph constructed in 2.4403061866760254 seconds Running Leiden optimization Leiden completed in 0.14300799369812012 seconds Sorting communities by size, please wait ... PhenoGraph completed in 3.796410083770752 seconds 25 Finding 25 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.11408829689025879 seconds Jaccard graph constructed in 2.190077304840088 seconds Running Leiden optimization Leiden completed in 0.14685702323913574 seconds Sorting communities by size, please wait ... PhenoGraph completed in 3.5612690448760986 seconds 30 Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1191098690032959 seconds Jaccard graph constructed in 3.010128974914551 seconds Running Leiden optimization Leiden completed in 0.13943696022033691 seconds Sorting communities by size, please wait ... PhenoGraph completed in 4.356630802154541 seconds 35 Finding 35 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.17951607704162598 seconds Jaccard graph constructed in 2.729011058807373 seconds Running Leiden optimization Leiden completed in 0.22387194633483887 seconds Sorting communities by size, please wait ... PhenoGraph completed in 4.348363876342773 seconds 40 Finding 40 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.152756929397583 seconds Jaccard graph constructed in 2.6567351818084717 seconds Running Leiden optimization Leiden completed in 0.24489378929138184 seconds Sorting communities by size, please wait ... PhenoGraph completed in 4.516801118850708 seconds 45 Finding 45 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 0.1406400203704834 seconds Jaccard graph constructed in 2.783215045928955 seconds Running Leiden optimization Leiden completed in 0.2328200340270996 seconds Sorting communities by size, please wait ... PhenoGraph completed in 4.311401128768921 seconds
res_mat = np.zeros(shape = (6, 6))
for j, item in enumerate(communities.keys()):
for k, item2 in enumerate(communities.keys()):
res_mat[j, k] = adjusted_rand_score(communities[item], communities[item2])
plt.figure(figsize= (8, 6))
plt.imshow(res_mat, vmin = 0, vmax = 1, cmap = 'magma')
plt.colorbar()
plt.xticks(range(6), communities.keys());
plt.yticks(range(6), communities.keys());
#plt.grid()#which='minor', color='w', linestyle='-', linewidth=5)
plt.xlabel('k')
plt.ylabel('k')
plt.title('Adjusted Rand Index')
Text(0.5, 1.0, 'Adjusted Rand Index')
ENDOMARKERS = ['PECAM1','CDH5', # canonical EC
'TMEM100','FOXF1', # lung specific
'MGP','PLAT','GJA4','GJA5','EDN1','FBLN2','HEY1', # artery
'VWF','VCAM1','SLC6A2','CD200','BST1','BST2','CH25H', # large vessel, vein
'GPIHBP1','SEMA3C','CADM1','HILPDA', # capillary 2
'CAR4','FIBIN','CYP4B1','EDNRB','KDR', # capillary 1
'LYVE1','PROX1','THY1','MMRN1','CP','FGL2', # lymphatic
]
fig = plt.figure(figsize = (8*4, 6*9))
for j, item in enumerate(ENDOMARKERS):
ax = fig.add_subplot(9, 4, j + 1)
c = filtered_endo_normlog[item]
im1 = ax.scatter(filtered_endo_tsneproj_saved['x'], filtered_endo_tsneproj_saved['y'],
c = c, vmin = np.percentile(c, 1), vmax = np.percentile(c, 99))
ax.axis('off')
ax.set_title(item)
fig.colorbar(im1)
ENDOMARKERS_dict = {'canonical_EC': ['PECAM1','CDH5'],
'lung_specific': ['TMEM100','FOXF1'],
'artery': ['MGP','PLAT','GJA4','GJA5','EDN1','FBLN2','HEY1'], # artery
'large vessel/vein': ['VWF','VCAM1','SLC6A2','CD200','BST1','BST2','CH25H'], # large vessel, vein
'capillary 2': ['GPIHBP1','SEMA3C','CADM1','HILPDA'], # capillary 2
'capillary 1': ['CAR4','FIBIN','CYP4B1','EDNRB','KDR'], # capillary 1
'lymphatic': ['LYVE1','PROX1','THY1','MMRN1','CP','FGL2'] # lymphatic
}
col_markers = []
ct = 0
for item, values in ENDOMARKERS_dict.items():
col_markers = col_markers + [cols_use[ct]]*len(values)
ct = ct + 1
endo_sub_data = filtered_endo_normlog[ENDOMARKERS].copy()
endo_sub_data['Cluster'] = list(filtered_endo_cluster_saved.values)
endo_sub_data_avg = endo_sub_data.groupby('Cluster').mean()
import matplotlib as mpl
mpl.style.use('default')
g = sns.clustermap(endo_sub_data_avg, standard_scale = 1, col_colors = col_markers, col_cluster = False,
figsize = (12, 6), linewidth = 0.2, cbar_pos=(1, .3, .01, .4))
# Draw the legend bar for the classes
for ct, label in enumerate([j for j in ENDOMARKERS_dict.keys()]):
g.ax_col_dendrogram.bar(0, 0, color=cols_use[ct],
label=label, linewidth=0)
g.ax_col_dendrogram.legend(ncol=2, bbox_to_anchor = (0, 0.8))
<matplotlib.legend.Legend at 0x7fdf86c41700>
res = utils.run_diffusion_maps(filtered_endo_pcaproj.iloc[:,0:50], knn = 30)
Determing nearest neighbor graph using euclidean distance metric
def impute_expression(T,expression,TSTEPS=4):
T_step = T
# impute values first, then keep stepping to speed it up
T_imputed = T_step.dot(expression.values)
for i in range(1,TSTEPS):
print('step {}'.format(i+1))
T_imputed = T_step.dot(T_imputed)
expression_imputed = pd.DataFrame(T_imputed,index=expression.index, columns=expression.columns)
return expression_imputed
filtered_endo_imputed = impute_expression(res['T'], filtered_endo_normlog, TSTEPS=4)
step 2 step 3 step 4
fig = plt.figure(figsize = (8*4, 6*9))
for j, item in enumerate(ENDOMARKERS):
ax = fig.add_subplot(9, 4, j + 1)
c = filtered_endo_imputed[item]
im1 = ax.scatter(filtered_endo_tsneproj_saved['x'], filtered_endo_tsneproj_saved['y'],
c = c, cmap = 'Spectral_r')
ax.axis('off')
ax.set_title(item)
fig.colorbar(im1)
endo_sub_data = filtered_endo_imputed[ENDOMARKERS].copy()
endo_sub_data['Cluster'] = list(filtered_endo_cluster_saved.values)
endo_sub_data_avg = endo_sub_data.groupby('Cluster').mean()
import matplotlib as mpl
mpl.style.use('default')
g = sns.clustermap(endo_sub_data_avg, standard_scale = 1, col_colors = col_markers, col_cluster = False,
figsize = (12, 6), linewidth = 0.2, cbar_pos=(1, .3, .01, .4))
# Draw the legend bar for the classes
for ct, label in enumerate([j for j in ENDOMARKERS_dict.keys()]):
g.ax_col_dendrogram.bar(0, 0, color=cols_use[ct],
label=label, linewidth=0)
g.ax_col_dendrogram.legend(ncol=2, bbox_to_anchor = (0, 0.8))
<matplotlib.legend.Legend at 0x7fdf8828fc10>
IMMGENES = ['CX3CL1','LTB','ICOSL','RELB','TNF','CD40','IL4RA',
'RSPO3','WNT2','CCL21A','TGFBR2','TGFB1','TGFB3','IL21R','IL11RA1',
'PTGS2','IL1B','CCL6','CD74','H2-EB1','CXCL12','IL6RA','IL33','IL7',
'AREG','TOP2A','ISG15','STAT1','CXCL9','CXCL10','CD274','IDO1']
fig = plt.figure(figsize = (8*4, 6*9))
for j, item in enumerate(IMMGENES):
ax = fig.add_subplot(9, 4, j + 1)
c = filtered_endo_normlog[item]
im1 = ax.scatter(filtered_endo_tsneproj_saved['x'], filtered_endo_tsneproj_saved['y'],
c = c, vmin = np.percentile(c, 1), vmax = np.percentile(c, 99))
ax.axis('off')
ax.set_title(item)
fig.colorbar(im1)
endo_sub_data = filtered_endo_normlog[IMMGENES].copy()
endo_sub_data['Cluster'] = list(filtered_endo_cluster_saved.values)
endo_sub_data_avg = endo_sub_data.groupby('Cluster').mean()
import matplotlib as mpl
mpl.style.use('default')
g = sns.clustermap(endo_sub_data_avg, standard_scale = 1, col_cluster = True,
figsize = (12, 6), linewidth = 0.2, cbar_pos=(1, .3, .01, .4))
# Draw the legend bar for the classes
#for ct, label in enumerate([j for j in ENDOMARKERS_dict.keys()]):
# g.ax_col_dendrogram.bar(0, 0, color=cols_use[ct],
# label=label, linewidth=0)
#g.ax_col_dendrogram.legend(ncol=2, bbox_to_anchor = (0, 0.8))
endo_sub_data = filtered_endo_imputed[IMMGENES].copy()
endo_sub_data['Cluster'] = list(filtered_endo_cluster_saved.values)
endo_sub_data_avg = endo_sub_data.groupby('Cluster').mean()
import matplotlib as mpl
mpl.style.use('default')
g = sns.clustermap(endo_sub_data_avg, standard_scale = 1, col_cluster = True,
figsize = (12, 6), linewidth = 0.2, cbar_pos=(1, .3, .01, .4))
endo_coarse_assn = {'artery_vein':[7],
'capillary_2':[1,2,3],
'capillary_1':[0,6],
'inflammatory_capillary':[8],
'lymphatic_endothelial':[4,5,9]
}
# individual cell assignments
endo_coarse_assn_clustmap = {}
for a,clusts in endo_coarse_assn.items():
for c in clusts:
endo_coarse_assn_clustmap[c] = a
endo_coarse = pd.Series(filtered_endo_cluster_saved.map(endo_coarse_assn_clustmap),
name='assignment')
fig = plt.figure(figsize = (8*2, 6*1))
ax = fig.add_subplot(1, 2, 1)
old_cluster = filtered_endo_cluster_saved.values
for j in np.unique(old_cluster):
ax.scatter(filtered_endo_tsneproj_saved['x'][old_cluster == j],
filtered_endo_tsneproj_saved['y'][old_cluster == j],
s = 1, c = cols[j], label = str(j))
ax.legend(markerscale = 5, fontsize = 10, ncol = 2)
ax.axis('off');
ax.set_title('Old clusters')
ax = fig.add_subplot(1, 2, 2)
new_cluster = endo_coarse
for j, item in enumerate(np.unique(new_cluster)):
ax.scatter(filtered_endo_tsneproj_saved['x'][new_cluster == item],
filtered_endo_tsneproj_saved['y'][new_cluster == item],
s = 1, c = cols[j], label = item)
ax.legend(markerscale = 5, fontsize = 10, ncol = 2)
ax.axis('off');
ax.set_title('New clusters')
Text(0.5, 1.0, 'New clusters')
Please see KP_model_data_filtering_part3.ipynb for rest of the analysis (Myeloid lineage).